Airbnb operates as an online marketplace focused on short-term homestays and experiences. The company acts as a broker and charges a commission from each booking. It connects hosts and travelers and enables the process of renting without owning any rooms itself. Airbnb cultivates the sharing economy which allows property owners to rent out private flats. In this sense it is a community-based online platform for listing and renting local homes.
The project's aim is to perform an analysis as part of Udacity's Data Science Nanodegree. It is structured according to the Cross-Industry Standard Process for Data Mining (CRISP-DM).
To define the purpose of this analysis, the following business questions are proposed:
These questions will be addressed by analysing the data provided by Inside Airbnb.
Import the libraries and load the data. If a package is not installed, use the following code to install it:
import sys
!{sys.executable} -m pip install geojson
# alternative: pip install osmnx --user
# import modules/libraries
import warnings
warnings.simplefilter(action='ignore')
import osmnx as ox
import pandas as pd
import numpy as np
import geopandas as gpd
import time
from scipy import stats
import itertools
import os
import pickle
import geojson
from sqlalchemy import create_engine
import re
import sqlite3
from pathlib import Path
from shapely.geometry.polygon import Polygon
from shapely.geometry.multipolygon import MultiPolygon
import chardet
from scipy import spatial
from scipy.spatial import KDTree
cwd = Path().resolve()
# visualisation
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib as mpl
%matplotlib inline
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', None)
# import the airbnb data
# All input files live in the "data" folder next to the working directory's parent.
data_dir = Path(cwd).parent / 'data'
listings_path = data_dir / 'listings.csv.gz'

# Columns of the listings export that are kept; every other column is discarded
# by the selection below.
listing_columns = [
    'id', 'name', 'description', 'host_name', 'host_since',
    'host_response_time', 'host_response_rate', 'host_acceptance_rate',
    'host_is_superhost', 'host_listings_count', 'host_total_listings_count',
    'host_has_profile_pic', 'host_identity_verified', 'neighbourhood',
    'neighbourhood_cleansed', 'latitude', 'longitude', 'property_type',
    'room_type', 'accommodates', 'bathrooms_text', 'bedrooms', 'beds',
    'amenities', 'price',
]
df = pd.read_csv(listings_path, encoding='utf-8')
# .copy() makes df an independent frame, so the column assignments below
# do not write into a slice of the full export.
df = df[listing_columns].copy()

# Overwrite the raw neighbourhood with the cleansed value, then drop the
# now-duplicated helper column.
df['neighbourhood'] = df['neighbourhood_cleansed']
df.drop(['neighbourhood_cleansed'], axis=1, inplace=True)

df_cal = pd.read_csv(data_dir / 'calendar.csv.gz', encoding='utf-8')
df_rev = pd.read_csv(data_dir / 'reviews.csv', index_col=False, sep=",")

# Sniff the text encoding of the listings file. The file is gzip-compressed,
# so the sample must be taken from the decompressed stream; the raw bytes of
# the archive carry no information about the text encoding.
import gzip
with gzip.open(listings_path, 'rb') as f:
    data = f.read(800000)
encoding = chardet.detect(data).get("encoding")

df_cal.head(1)
| listing_id | date | available | price | adjusted_price | minimum_nights | maximum_nights | |
|---|---|---|---|---|---|---|---|
| 0 | 15883 | 2022-09-12 | f | $110.00 | $110.00 | 1.0 | 365.0 |
def aggregate_data(df, group='', agge='', rename=''):
    """Group a dataframe, aggregate it and rename the resulting columns.

    Args:
        df: dataframe to aggregate; it is not modified.
        group: name of the column to group by.
        agge: aggregation specification handed to ``groupby(...).agg``,
            e.g. ``{'price': ['median']}`` or ``{'price': 'median'}``.
        rename: new column labels, one per aggregated column in the order
            ``agg`` produces them. When empty (the default) the labels
            produced by the aggregation are kept.

    Returns:
        The aggregated dataframe with a fresh 0..n-1 index. The group keys
        are dropped from the result, so aggregate the grouping column
        explicitly (e.g. with ``'first'``) if it is needed.
    """
    result = df.groupby([group]).agg(agge)
    # A dict of lists yields two-level columns (source column, function);
    # a flat specification yields a single level, on which droplevel raises.
    if isinstance(result.columns, pd.MultiIndex):
        result.columns = result.columns.droplevel(0)
    # An empty `rename` cannot be assigned as column labels, so skip it.
    if len(rename) > 0:
        result.columns = rename
    return result.reset_index(drop=True)
def tukey_rule(data_frame, column_name):
    """Remove the rows whose value in ``column_name`` is a Tukey outlier.

    A value is kept when it lies within the fences
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. The fences are inclusive, so a
    column whose IQR is 0 keeps the values equal to its quartiles instead
    of losing every row.

    Args:
        data_frame: dataframe to filter; it is not modified.
        column_name: name of the numeric column the rule is applied to.

    Returns:
        The rows of ``data_frame`` inside the fences. Rows where the column
        is NaN fail both comparisons and are therefore dropped as well.
    """
    values = data_frame[column_name]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - 1.5 * iqr
    upper_fence = q3 + 1.5 * iqr
    return data_frame[(values >= lower_fence) & (values <= upper_fence)]
def visualize_outliers(df, title, n_districts=3):
    """Show one violin plot of the listing prices per district.

    Args:
        df: dataframe with a ``neighbourhood`` and a ``price`` column.
        title: title of the figure.
        n_districts: number of districts to draw, taken in order of first
            appearance in ``df['neighbourhood']``. Defaults to 3.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    fig = go.Figure()
    districts = df['neighbourhood'].unique().tolist()[:n_districts]
    for district in districts:
        # Build the row mask once and reuse it for both axes.
        in_district = df['neighbourhood'] == district
        fig.add_trace(go.Violin(
            x=df['neighbourhood'][in_district],
            y=df['price'][in_district],
            name=district, legendgroup='neighbourhood',
            line_color='blue',
            width=0.8, box_visible=True, meanline_visible=True))
    # NOTE(review): the traces plot individual listing prices while the axis
    # label says "Median price" - confirm the intended wording.
    # NOTE(review): the top margin is 0, which may leave no room for the
    # title - confirm it is visible in the rendered figure.
    fig.update_layout(
        title=title,
        yaxis_title="Median price in $",
        violinmode='group',
        violingap=0,
        font=dict(family="Helvetica"),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        autosize=False,
        width=700,
        height=400,
    )
    fig.show()
# Leading digits, optional three-digit groups, optional two-digit fraction.
# `\d+` (rather than `\d{1,3}`) keeps ungrouped numbers such as "1250" intact.
_PRICE_PATTERN = re.compile(r'\d+(?:[.,]\d{3})*(?:[.,]\d{2})?')


def get_price(price_string):
    """Convert a price string such as ``'$1,250.00'`` into a float.

    Spaces are removed, the first number in the string is extracted, commas
    are dropped as thousands separators and the rest is parsed with
    ``float`` (so ``.`` is the decimal point).

    Args:
        price_string: the price as text.

    Returns:
        The price as a float. When the input is not a string, contains no
        number or cannot be parsed, the offending value is printed and
        ``None`` is returned.
    """
    try:
        price_string = price_string.replace(' ', '')
        return float(_PRICE_PATTERN.findall(price_string)[0].replace(',', ''))
    except (AttributeError, IndexError, TypeError, ValueError):
        # Best effort: report the value that could not be converted.
        print(price_string)
        return None
def remove_pct(rate):
    """Convert a percentage string such as ``'95%'`` into the integer 95.

    Args:
        rate: the percentage as text.

    Returns:
        The integer value, or ``np.nan`` when ``rate`` has no usable
        ``replace`` method (e.g. a missing value) or when the text left
        after removing ``%`` is not an integer.
    """
    try:
        return int(rate.replace('%', ''))
    except (AttributeError, TypeError, ValueError):
        # Only conversion problems map to NaN; anything else propagates.
        return np.nan
def convert_dtype(df, cols_cur):
    """Convert currency columns from text such as ``'$1,250.00'`` to float.

    The conversion happens in place on ``df``; nothing is returned.

    Args:
        df: dataframe holding the currency columns.
        cols_cur: names of the string columns to convert.
    """
    for col in cols_cur:
        # regex=False: "$" is a regex anchor, and the default of `regex`
        # differs between pandas versions - always treat it literally.
        cleaned = (
            df[col]
            .str.replace('$', '', regex=False)
            .str.replace(',', '', regex=False)
        )
        df[col] = cleaned.astype(float)
# --- listings: host rates and identifier ---
# '95%' -> 95; values remove_pct cannot convert become NaN.
df['host_response_rate'] = pd.to_numeric(df['host_response_rate'].apply(remove_pct))
df['host_acceptance_rate'] = pd.to_numeric(df['host_acceptance_rate'].apply(remove_pct))
df['id'] = df['id'].astype('category')

# --- calendar: prices, dates, availability ---
convert_dtype(df_cal, ['price', 'adjusted_price'])
df_cal['date'] = pd.to_datetime(df_cal['date'])
# 't' -> 1, anything else (including missing values) -> 0
df_cal['available'] = (df_cal['available'] == 't').astype('int64')
# pandas counts Monday as 0, so 4 and 5 are Friday and Saturday.
df_cal['weekend'] = df_cal['date'].dt.dayofweek.isin([4, 5])
# NOTE(review): this merge copies host_since and neighbourhood from df
# before they are cleaned further down - confirm that is intended.
df_cal = df_cal.merge(
    df[['id', 'name', 'host_since', 'neighbourhood', 'host_is_superhost',
        'latitude', 'longitude', 'room_type']],
    how='left', left_on='listing_id', right_on='id')

# --- listings: price and district names ---
# get_price prints every value it cannot convert and yields a missing value.
df['price'] = pd.to_numeric(df['price'].apply(get_price))
# Repair district names whose non-ASCII characters were damaged.
neighbourhood_fixes = {
    'Landstra§e': 'Landstraße',
    'Rudolfsheim-Fnfhaus': 'Rudolfsheim-Fünfhaus',
    'Dbling': 'Döbling',
    'Whring': 'Währing',
}
for broken, fixed in neighbourhood_fixes.items():
    df['neighbourhood'] = df['neighbourhood'].str.replace(broken, fixed, regex=False)

# set data types
df_rev['date'] = pd.to_datetime(df_rev['date'])
df['host_since'] = pd.to_datetime(df['host_since'])
# Whole days between host_since and the fixed reference date.
df['host_for'] = (pd.to_datetime('2022-11-05') - df['host_since']).dt.days
df.head(1)
| id | name | description | host_name | host_since | host_response_time | host_response_rate | host_acceptance_rate | host_is_superhost | host_listings_count | host_total_listings_count | host_has_profile_pic | host_identity_verified | neighbourhood | latitude | longitude | property_type | room_type | accommodates | bathrooms_text | bedrooms | beds | amenities | price | host_for | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 15883 | b&b near Old Danube river | Four rooms, each one differently and individua... | Eva | 2009-12-11 | within a day | 50.0 | 33.0 | f | 4.0 | 6.0 | t | t | Donaustadt | 48.24262 | 16.42767 | Room in bed and breakfast | Hotel room | 3 | 1 private bath | 1.0 | 2.0 | ["Essentials", "Heating", "High chair", "Hange... | 110.0 | 4712.0 |
# Keep an untouched copy so the price distribution can be compared
# before and after outlier removal.
df_orig = df.copy()
# Apply the Tukey rule to each listed column in turn (currently only 'price');
# df is replaced by the filtered frame.
for column in ['price']:
    df = tukey_rule(df, column)
# Same plot for both frames: unfiltered first, filtered second.
visualize_outliers(df_orig, 'Prices of original data set')
visualize_outliers(df, 'Prices after outlier removal')
# Percentage of missing values per column of df, rounded to two decimals
dfm = round(df.isnull().mean()*100,2)
# One row per column, sorted so the columns with the most missing values come first
missing_value_df = pd.DataFrame({'column': dfm.index, 'percentage': dfm.values}).sort_values('percentage', ascending=False)
# Show the ten columns with the highest share of missing values
missing_value_df.head(10)
| column | percentage | |
|---|---|---|
| 5 | host_response_time | 31.58 |
| 6 | host_response_rate | 31.58 |
| 7 | host_acceptance_rate | 28.10 |
| 20 | bedrooms | 10.17 |
| 2 | description | 2.07 |
| 21 | beds | 1.26 |
| 1 | name | 0.10 |
| 19 | bathrooms_text | 0.06 |
| 12 | host_identity_verified | 0.02 |
| 11 | host_has_profile_pic | 0.02 |
# numerical data exploration:
# one histogram (50 bins, linear y-axis) per numeric column of df
df.select_dtypes(include=np.number).hist(figsize=(16, 10), bins=50, log=False)
# Title for the whole grid of histograms
plt.suptitle('Feature Distributions', y=1.0)
plt.tight_layout()
def categorical_distribution(df, column, table):
    """
    Count the listings per category of one column and plot the counts twice.

    INPUT:
    df - dataframe with an 'id' column and the categorical column
    column - name of the categorical column to count listings for
    table - not used by this function

    OUTPUT:
    Draws two bar charts side by side: the left one with a linear count axis,
    the right one with a logarithmic count axis.
    Returns the counts per category, sorted from most to fewest listings,
    with an added 'proportion [%]' column (share of all listings, one decimal).
    """
    counts = df.groupby(column).agg(nr_listings=('id', 'count'))
    counts = counts.reset_index().sort_values('nr_listings', ascending=False)

    fig, axes = plt.subplots(1, 2, figsize=(16, 4))
    panels = (
        (axes[0], False, 'number listings'),
        (axes[1], True, 'log (number listings)'),
    )
    for axis, use_log, y_label in panels:
        counts.plot(x=column, y='nr_listings', kind='bar', ax=axis,
                    legend=False, grid=True, log=use_log)
        if use_log:
            # Only the logarithmic panel carries its own title.
            axis.set_title('log scale')
        axis.set_ylabel(y_label)
        axis.set_xlabel('')

    fig.suptitle('distribution number listings over ' + column.title(), y=1.05)
    fig.tight_layout()

    total = counts['nr_listings'].sum()
    counts['proportion [%]'] = round(100 * counts['nr_listings'] / total, 1)
    return counts
# display bar charts in normal and logarithmic scale for two categorical columns:
# room_type and property_type
# NOTE(review): `neigh` holds the room_type distribution despite its name.
neigh = categorical_distribution(df, 'room_type', table=False)
property_type = categorical_distribution(df, 'property_type', table=False)
Airbnb hosts can list entire homes/apartments, private rooms, shared rooms and, more recently, hotel rooms. Depending on the room type and activity, a residential Airbnb listing could be more like a hotel, disruptive for neighbours, taking away housing, and illegal.
def get_geo_data():
    """Read data/geojson/vienna.geojson (relative to the parent of ``cwd``).

    Returns:
        The object parsed by ``geojson.load`` from that file.
    """
    geojson_path = Path(cwd).parent / 'data' / 'geojson' / 'vienna.geojson'
    with geojson_path.open(encoding='utf-8') as handle:
        return geojson.load(handle)
def heatmap_airbnb2(title=''):
    """Show a district map coloured by the median listing price.

    Reads the module-level ``df``: the median price is computed per
    ``neighbourhood`` and drawn as a choropleth over the shapes returned by
    ``get_geo_data``; every listing is added as a point on top.

    Args:
        title: title handed to the choropleth figure.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    shapes = get_geo_data()
    per_district = aggregate_data(
        df,
        'neighbourhood',
        {'neighbourhood': ['first'], 'price': ['median'], 'host_is_superhost': ['first']},
        rename=['district', 'median', 'host_is_superhost'],
    )
    per_district = per_district.sort_values(by='median', ascending=True)
    # Presumably the categorical dtype makes plotly express colour each
    # median as a discrete class - verify against the rendered legend.
    per_district['median'] = per_district['median'].astype('category')
    per_district = per_district.sort_values(by='median', ascending=False)

    fig = px.choropleth_mapbox(
        per_district,
        geojson=shapes,
        locations=per_district['district'],
        featureidkey="properties.name",
        color=per_district['median'],
        title=title,
        color_discrete_sequence=px.colors.qualitative.Prism,
        labels={'median': 'price per night'},
        mapbox_style="open-street-map",
        zoom=10,
        center={"lat": 48.210033, "lon": 16.363449},
        opacity=0.60,
    )
    # One marker per listing, without legend entry or hover text.
    fig.add_scattermapbox(
        lat=df['latitude'].tolist(),
        lon=df['longitude'].tolist(),
        mode='markers',
        showlegend=False,
        marker_size=5,
        marker_color='#F3B5B6',
        opacity=0.5,
        hoverinfo='skip',
    )
    fig.update_layout(
        font=dict(family="Helvetica"),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        autosize=False,
        width=700,
        height=500,
    )
    fig.show()
def heatmap_airbnb(title=''):
    """Show a district map coloured by the number of listings.

    Reads the module-level ``df``: listings are counted per
    ``neighbourhood`` and drawn as a choropleth over the shapes returned by
    ``get_geo_data``; every listing is added as a point on top.

    Args:
        title: title handed to the choropleth figure.

    The figure is displayed with ``fig.show()``; nothing is returned.
    """
    shapes = get_geo_data()
    per_district = df.groupby('neighbourhood').agg(nr_listings=('id', 'count'))
    per_district = per_district.reset_index().sort_values('nr_listings', ascending=False)
    # Share of all listings in percent; computed but not used for colouring.
    total = per_district['nr_listings'].sum()
    per_district['ratio'] = 100 * per_district['nr_listings'] / total
    # Presumably the categorical dtype makes plotly express colour each
    # count as a discrete class - verify against the rendered legend.
    per_district['nr_listings'] = per_district['nr_listings'].astype('category')
    per_district = per_district.sort_values(by='nr_listings', ascending=False)

    fig = px.choropleth_mapbox(
        per_district,
        geojson=shapes,
        locations=per_district['neighbourhood'],
        featureidkey="properties.name",
        color=per_district['nr_listings'],
        color_discrete_sequence=px.colors.qualitative.Dark24,
        title=title,
        labels={'nr_listings': 'Nr. of listings'},
        mapbox_style="open-street-map",
        zoom=10,
        center={"lat": 48.210033, "lon": 16.363449},
        opacity=0.40,
    )
    # One marker per listing; shown in the legend, without hover text.
    fig.add_scattermapbox(
        lat=df['latitude'].tolist(),
        lon=df['longitude'].tolist(),
        mode='markers',
        marker_size=2,
        marker_color='#F3F5F6',
        opacity=0.9,
        showlegend=True,
        hoverinfo='skip',
    )
    fig.update_layout(
        font=dict(family="Helvetica"),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        autosize=False,
        width=700,
        height=500,
    )
    fig.show()
def bar_airbnb(df):
    """Show a bar chart with the share of listings per neighbourhood.

    Args:
        df: dataframe with an ``id`` and a ``neighbourhood`` column.

    Listings are counted per neighbourhood, converted to a percentage of all
    listings and drawn from the largest to the smallest share. The figure is
    displayed with ``fig.show()``; nothing is returned.
    """
    shares = df.groupby('neighbourhood').agg(nr_listings=('id', 'count'))
    shares = shares.reset_index().sort_values('nr_listings', ascending=False)
    shares['ratio'] = 100 * shares['nr_listings'] / shares['nr_listings'].sum()

    fig = px.bar(x=shares['neighbourhood'].tolist(), y=shares['ratio'])
    fig.update_traces(marker_line_color='#9c9c9c', marker_line_width=1, opacity=0.7)
    fig.update_layout(
        xaxis={'visible': True, 'showticklabels': True},
        yaxis={'visible': True, 'showticklabels': True},
    )

    # Both axes share the same tick, frame and grid styling.
    axis_style = dict(
        tickfont=dict(family='Helvetica', color='#9c9c9c'),
        title_font_color='#9c9c9c',
        mirror=True,
        ticks='outside',
        showline=True,
        gridwidth=1,
        gridcolor='#4c4c4c',
    )
    fig.update_yaxes(title='Listings in %', **axis_style)
    fig.update_xaxes(**axis_style)

    fig.update_layout(xaxis_title=None)
    fig.update_layout(
        font=dict(family="Helvetica"),
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        margin={"r": 0, "t": 0, "l": 0, "b": 0},
        autosize=False,
        width=800,
        height=400,
    )
    fig.show()
# Share of listings per neighbourhood as a bar chart (after outlier removal, df is the filtered frame)
bar_airbnb(df)
# Number of listings per neighbourhood on the district map; heatmap_airbnb reads the module-level df
heatmap_airbnb()